Run pactolus on metatlas files


In [1]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas/')
sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )

from metatlas import metatlas_objects as metob
from metatlas.helpers import pactolus_tools as pt
from metatlas.helpers import dill2plots as dp

import numpy as np
import glob as glob
import os
import pandas as pd

from matplotlib import pyplot as plt

# %matplotlib notebook
%matplotlib inline
from pprint import pprint

from matplotlib import collections as mplib_collections
import matplotlib.colors as mpl_colors

Get a list of files

Through whatever means, get a list of paths to hdf5 metatlas files. These files will be read using metatlas data access tools and formated for Pactolus.

Typically you will either:

  • upload a spreadsheet and read in a list of runs
  • get a list of runs using the "retrieve" command in metatlas
  • get a list of groups using the "retrieve" command in metatlas

Regardless, preparing the list of full paths to the hdf5 metatlas files is a necessary first step.

Define the files manually


In [ ]:
%system ls -ltr $SCRATCH

In [ ]:
myfiles = ['/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_WT_M145_Day6_3of4___Run61.h5',
'/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_Ref_TwoNine_Day6_1of4___Run65.h5',
'/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_Del_TwoFour_Day6_3of4___Run21.h5']

my_groups = ['wild type','refactored','deleted']

Get files from metatlas groups


In [ ]:
#20160413_Bhedlund_pHILIC_POS_JAD2_GBS_Set1
# EMA_%_QE144_50447_20170120
temp_group = dp.select_groups_for_analysis(name = '%UV_Fungus_1to10%',
                                       most_recent = True,
                                       remove_empty = True,#'Strain=SB214'
                                       include_list = [], exclude_list = ['QC','Blank'])

# temp_group = metob.retrieve('Groups', name = '20160413%GBS%Set1', username='*')
my_files = []
my_groups = []
for i,g in enumerate(temp_group):
#     if not '32' in g.name:
#         if not '4O' in g.name:
    print g.name#,g.last_modified
    if (len(g.items) > 0):
        for f in g.items:
            #print f.hdf5_file
            my_files.append(f.hdf5_file)
            my_groups.append(g.name)
#for f in my_files:
#    print f

Get a list of files from "retrieve" os LcmsRuns from metatlas


In [2]:
my_run = metob.retrieve('LcmsRun',experiment='20170512_SK_-MR_SupprSoils_EthylAc2',name='%', username='*')
my_files = []
for m in my_run:
    if (not '_qc-' in m.hdf5_file.lower()) & (not 'injbl' in m.hdf5_file.lower()):
        my_files.append( m.hdf5_file )
my_files = np.unique(my_files)
print len(my_files)
f = [os.path.basename(f) for f in my_files]
for ff in f:
    print ff


84
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_Control-1-r1_IR2_47_47.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E1-r1_IR2_14_14.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E1-r2_IR2_41_41.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E1-r3_IR2_74_74.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E1-r4_IR2_89_89.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E1-r5_IR2_26_26.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E14-r1_IR2_110_110.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E14-r2_IR2_29_29.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E14-r3_IR2_44_44.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E14-r4_IR2_80_80.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E14-r5_IR2_32_32.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E19-r1_IR2_20_20.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E19-r2_IR2_50_50.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E19-r3_IR2_59_59.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E19-r4_IR2_56_56.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E19-r5_IR2_104_104.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E22-r1_IR2_125_125.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E22-r2_IR2_11_11.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E22-r3_IR2_53_53.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E22-r4_IR2_68_68.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E22-r5_IR2_131_131.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E23-r1_IR2_35_35.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E23-r2_IR2_65_65.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E23-r3_IR2_101_101.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E23-r4_IR2_98_98.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E23-r5_IR2_119_119.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E25-r1_IR2_92_92.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E25-r2_IR2_23_23.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E25-r3_IR2_71_71.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E25-r4_IR2_128_128.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E25-r5_IR2_62_62.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E4-r1_IR2_08_08.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E4-r2_IR2_83_83.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E4-r3_IR2_17_17.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E4-r4_IR2_86_86.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E4-r5_IR2_107_107.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E6-r1_IR2_116_116.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E6-r2_IR2_38_38.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E6-r3_IR2_95_95.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E6-r4_IR2_122_122.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_E6-r5_IR2_113_113.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_NEG_control-2-r1_IR2_77_77.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_Control-1-r1_IR1_46_46.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E1-r1_IR1_13_13.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E1-r2_IR1_40_40.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E1-r3_IR1_73_73.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E1-r4_IR1_88_88.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E1-r5_IR1_25_25.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E14-r1_IR1_109_109.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E14-r2_IR1_28_28.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E14-r3_IR1_43_43.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E14-r4_IR1_79_79.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E14-r5_IR1_31_31.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E19-r1_IR1_19_19.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E19-r2_IR1_49_49.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E19-r3_IR1_58_58.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E19-r4_IR1_55_55.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E19-r5_IR1_103_103.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r1_IR1_124_124.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r2_IR1_10_10.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r3_IR1_52_52.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r4_IR1_67_67.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r5_IR1_130_130.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r1_IR1_34_34.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r2_IR1_64_64.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r3_IR1_100_100.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r4_IR1_97_97.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r5_IR1_118_118.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E25-r1_IR1_91_91.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E25-r2_IR1_22_22.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E25-r3_IR1_70_70.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E25-r4_IR1_127_127.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E25-r5_IR1_61_61.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E4-r1_IR1_07_07.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E4-r2_IR1_82_82.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E4-r3_IR1_16_16.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E4-r4_IR1_85_85.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E4-r5_IR1_106_106.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E6-r1_IR1_115_115.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E6-r2_IR1_37_37.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E6-r3_IR1_94_94.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E6-r4_IR1_121_121.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E6-r5_IR1_112_112.h5
20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_control-2-r1_IR1_76_76.h5

Get a list of files from a spreadsheet


In [ ]:
import pandas as pd
# files = pd.read_csv('phylogenetic c18 files.csv')
files = pd.read_excel('all cameron files_RCC_selected (2).xls')
# print files.head()
all_myfiles = files['full path'].tolist()
pat = 'P_halotolerans'
myfiles = []
for f in all_myfiles:
    if pat in f:
        myfiles.append(f)
print len(myfiles)
print myfiles

Specify directory for working in

The Pactolus jobs can create a large amount of temporary results. These will likely fill your home directory.

  • $SCRATCH
  • /project/projectdirs/metatlas/projects
  • /project/projectdirs/openmsi/projects are all good places you can create your folders.

I usually create my temporary storage here:

target_dir = '/project/projectdirs/openmsi/projects/ben_run_pactolus/test_tools_4'

Replace test_tools_4 with an appropriately named temporary folder.


In [ ]:
root_path = %system echo $SCRATCH
root_path = root_path[0]
# root_path = root_path.replace('/cscratch1/sd','/scratch2/scratchdirs').replace('/global','')
root_path

In [3]:
root_path = '/project/projectdirs/metatlas/projects/jgi_projects/'

In [4]:
# target_dir = os.path.join(root_path,'pactolus_runs','20161011_actinorhodin')
target_dir = os.path.join(root_path,'Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2')
if not os.path.isdir(target_dir):
    os.mkdir(target_dir)
target_dir
#     '/project/projectdirs/openmsi/projects/ben_run_pactolus/20161011_actinorhodin/'


Out[4]:
'/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2'

In [5]:
# %cat $target_dir/20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-NEG_MSMLS-P2-RD_IR2_31_31.h5_polarity_0.sbatch

In [6]:
%system ls $target_dir


Out[6]:
[]

In [7]:
pt = reload(pt)
pt.create_pactolus_msms_data_container(my_files,target_dir,3e5,min_rt = 0.5, max_rt = 30.0,make_container=True)


Out[7]:
'/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/container_file_polarity_1.h5'

In [ ]:
# %system cat /global/cscratch1/sd/bpb/pactolus_runs/Cori_20161031_KBL_C18_HO_SecMet_Plate1to3/*.sbatch

In [8]:
with open('/global/homes/b/bpb/Downloads/pactolus_jobs_20170512_SK_-MR_SupprSoils_EthylAc2.sh','w') as fid:
    check_path = os.path.join(target_dir,'*.sbatch')
    sbatch_files = %system ls $check_path
    for s in sbatch_files:
        fid.write('%s %s\n'%('sbatch',s))

In [ ]:
len(sbatch_files)

In [ ]:
for sf in sbatch_files:
    print 'sbatch',sf

In [ ]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)

df = %squeue -u bpb -p realtime
print len(myfiles), df.shape

In [ ]:
from IPython.display import HTML
import pandas as pd
h = HTML(df.to_html())
with open('current_jobs.html', 'w') as my_file:
    my_file.write(h.data)

Submit your jobs to the Cori realtime queue


In [ ]:
pt = reload(pt)
mydirs = ['rexmalm_pos_pellet']
for d in mydirs:
    root_path = %system echo $SCRATCH
    root_path = root_path[0]
    target_dir = os.path.join(root_path,'pactolus_runs',d)
    print d
    my_session = pt.submit_all_jobs(target_dir)#,usr='bpb',pwd='Bron$e00',)

Check the status of your jobs


In [ ]:
pt = reload(pt)
jobs = pt.check_job_status()

In [ ]:
%system ls /project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/*.sbatch

Hacking on spectral similarity


In [ ]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/pactolus/')
from pactolus import score_frag_dag_wip as sfd
sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from metatlas import metatlas_objects as metob
import pandas as pd

In [ ]:
q = """
select rtreferences.rt_peak, rtreferences.rt_min, rtreferences.rt_max, rtreferences.rt_units, rtreferences.last_modified, rtreferences.username
from rtreferences, compoundidentifications, compoundidentifications_rt_references, compoundidentifications_compound, compounds
where
rtreferences.unique_id = compoundidentifications_rt_references.target_id and
compoundidentifications.unique_id = compoundidentifications_rt_references.source_id and
compoundidentifications.unique_id = compoundidentifications_compound.source_id and
compounds.unique_id = compoundidentifications_compound.target_id and
compounds.inchi_key = 'MTCFGRXMJLQNBG-REOHCLBHSA-N'
"""
result = [e for e in metob.database.query(q)]

In [ ]:
df = pd.DataFrame(entries)
df.head()

In [ ]:
# fragmentationreferences.precursor_mz, fragmentationreferences.polarity, fragmentationreferences.collision_energy, fragmentationreferences.technique, fragmentationreferences.username

In [ ]:
q = """
select fragmentationreferences.precursor_mz, fragmentationreferences.polarity, fragmentationreferences.collision_energy, fragmentationreferences.technique, fragmentationreferences.username
from fragmentationreferences, compoundidentifications, compoundidentifications_frag_references, compoundidentifications_compound, compounds
where
fragmentationreferences.unique_id = compoundidentifications_frag_references.target_id and
compoundidentifications.unique_id = compoundidentifications_frag_references.source_id and
compoundidentifications.unique_id = compoundidentifications_compound.source_id and
compounds.unique_id = compoundidentifications_compound.target_id and
compounds.inchi_key = 'MTCFGRXMJLQNBG-REOHCLBHSA-N'
"""
result = [e for e in metob.database.query(q)]

In [ ]:
df = pd.DataFrame(entries)
df.head()

Hacking on getting the serial jobs to go faster


In [ ]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/pactolus/')
from pactolus import score_frag_dag_wip as sfd

In [ ]:
sfd = reload(sfd)
container_file = '/project/projectdirs/metatlas/projects/pactolus_runs/20170425_EMA_For_MAGI_PAPER/container_file_polarity_1.h5'
sample_file = '20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-POS_MSMLS-PKZ-R9_IR1_148_148.h5'
# input_file = container_file + ':' + sample_file
tree_file = '/project/projectdirs/metatlas/projects/clean_pactolus_trees/tree_lookup.npy'
out_file = '/project/projectdirs/metatlas/projects/magi_paper/test_pactolus.h5'
#results is just the score matrix
results = sfd.score_main(use_command_line=False,
                     ms1_mass_tolerance=0.01,
                     ms2_mass_tolerance=0.01,
                     max_depth=3,
                     neutralizations=[-1.00727646677,-2.0151015067699998,0.00054857990946],
                     schedule='',
                     trees=tree_file,
                     metabolite_database=None,
                     input_filepath=container_file,
                     input_grouppath=sample_file,
                     output_filepath=out_file,
                     output_grouppath='asdf',
                     match_matrix=False,
                     precursor_mz=-1,
                     pass_scanmeta=True,
                     pass_scans=True,
                     loglevel='ERROR', #use ERROR,INFO, or DEBUG
                     tempdir='/project/projectdirs/metatlas/projects/pactolus_tempdir',
                     clean_tempdir=True,
                     clean_output=True,
                     pass_compound_meta=False,
                     start_barrier=False)
# This creates a temp file and that temp file is then converted into the standard pactolus file

In [ ]:
%system mkdir /project/projectdirs/metatlas/projects/pactolus_tempdir

In [ ]:
# jobs = %system cat /project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/*.sbatch | grep 'srun -n 32 -N 1'
# for j in jobs:
#     print j.replace('srun -n 32 -N 1 python','%run').replace('/tmp/packages/pactolus/','/project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/')
#     print ''

In [ ]:
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/container_file_polarity_1.h5:20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --neutralizations "[-1.00727646677,-2.0151015067699998,0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --precursor_mz -1 --match_matrix False --clean_output True

In [ ]:
# sys.path.insert(1,'/global/homes/b/bpb/metaiq/pactolus/')
# import no_bastet_score_frag_dag as pactolus

In [ ]:
# scan_list, scan_metadata, experiment_metadata = load_scan_data_hdf5(filepath=input_filepath,
#                                                                        grouppath=input_grouppath)

# #     # Determine the ms1_mz, i.e., the precursor m/z
# #     ms1_mz = scan_metadata['ms1_mz'] if 'ms1_mz' in scan_metadata else None

# #     file_lookup_table = load_file_lookup_table(path=trees)

# #     # size input variables
# #     num_scans = len(scan_list)
# #     num_compounds = len(file_lookup_table)

# #     results = score_scan_list_against_trees(scan_list=scan_list,
# #                                             ms1_mz=ms1_mz,
# #                                             file_lookup_table=file_lookup_table,
# #                                             neutralizations=neutralizations,
# #                                             ms2_mass_tol=ms2_mass_tol,
# #                                             ms1_mass_tol=ms1_mass_tol,
# #                                             max_depth=max_depth,
# #                                             temp_out_group=temp_out_group,
# #                                             want_match_matrix=match_matrix,
# #                                             )

In [ ]:
# polarity = 0
# import os
# sfd = '/global/homes/b/bpb/metaiq/pactolus/score_frag_dag.py'
# cf = os.path.join(target_dir,'container_file_polarity_%d.h5'%polarity)
# import h5py
# with h5py.File(cf,'r') as hf:
#     dfs = hf.keys()
# ofs = [os.path.join(target_dir,'pactolus_results_%s'%f) for f in dfs]

# of = ofs[0]
# df = dfs[0]
# tp = '/project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy'
# pos_neut = '[-1.00727646677,-2.0151015067699998,0.00054857990946]'
# neg_neut = '[+1.00727646677,+2.0151015067699998,-0.00054857990946]'

In [ ]:
# %system cat $sfd
# %system ls -lt $target_dir

In [ ]:
# %system ps -lu bpb

In [ ]:
# print job_script

In [ ]:
# params = (sfd,cf,df,neg_neut,tp,of)
# #job_script = '%s --input "%s:%s" --neutralizations "%s" --trees %s --ms1_mass_tolerance 0.015 --ms2_mass_tolerance 0.015 --max_depth 5 --save "%s" --match_matrix False'%params

# job_script = '%s --clean_tempdir True --loglevel DEBUG --input "%s:%s" --neutralizations "%s" --trees %s --ms1_mass_tolerance 0.015 --ms2_mass_tolerance 0.015 --max_depth 5 --save "%s" --precursor_mz -1 --match_matrix False --clean_output True'%params
# %run $job_script

In [ ]:
# %tb

In [ ]:
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/container_file_polarity_1.h5:20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_6_HighExp_Media_Fungus__1001_A__Run30.h5" --neutralizations "[-1.00727646677,-2.0151015067699998,0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_6_HighExp_Media_Fungus__1001_A__Run30.h5" --precursor_mz -1 --match_matrix False --clean_output True

In [ ]:
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/container_file_polarity_1.h5:20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --neutralizations "[-1.00727646677,-2.0151015067699998,0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --precursor_mz -1 --match_matrix False --clean_output True

In [ ]:
# #%run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/container_file_polarity_0.h5:20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_raw-BR3_IR2_28.h5" --neutralizations "[1.00727646677,2.0151015067699998,-0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/pactolus_results_20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_raw-BR3_IR2_28.h5" --precursor_mz -1 --match_matrix False --clean_output True
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py 
# --clean_tempdir True 
# --loglevel DEBUG 
# --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/container_file_polarity_0.h5:20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_cooked-BR1_IR2_10.h5" 
# --neutralizations "[1.00727646677,2.0151015067699998,-0.00054857990946]" 
# --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy 
# --ms1_mass_tolerance 0.025 
# --ms2_mass_tolerance 0.025 
# --max_depth 5 
# --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/pactolus_results_20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_cooked-BR1_IR2_10.h5" 
# --precursor_mz -1 --match_matrix False --clean_output True

After jobs finish, look for failed runs (missing hdf5 files)


In [ ]:
f = pt.check_for_failed_jobs(target_dir)

Read your results


In [ ]:
#pt = reload(pt)
#metatlas_name,neutral_inchi,neutral_mass =pt.get_neutral_inchi_and_name(use_pickle=True)

In [ ]:
# target_dir = '/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/'

In [ ]:
import glob as glob
import os
import h5py
files = glob.glob(os.path.join(target_dir,'*.h5'))
for i,f in enumerate(files):
    print 'File=%d\t%s'%(i,os.path.basename(f))
    with h5py.File(f,'r') as output_file:
        my_keys = output_file.keys()
#         counter = 0
#         for k in my_keys:
#             if 'match_matrix' not in k:
#                 counter = counter +1
#         try:
#             score_matrix = output_file['score_matrix'][:]
#             num = score_matrix.shape[0]
#         except:
#             num = 0
        
#     print 'File=%d\tKeys=%d\tScans=%d\t%s'%(i,counter,num,os.path.basename(f))

In [ ]:
with h5py.File(os.path.join(target_dir,'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_6_HighExp_Media_Fungus__1001_A__Run30.h5')) as output_file:
    print output_file.keys()
    #[a[0] for a in output_file['tree_file_lookup_table']]

In [ ]:
pt = reload(pt)

all_dfs = pt.make_output_tables(target_dir)
#,metatlas_name,neutral_inchi,neutral_mass,score_cutoff=0.001,intensity_min=1e5,to_excel=True)
#TODO: This takes a surprisingly long time to run for each file.

merge_sheets applies a delta mass filter


In [ ]:
pt = reload(pt)

my_sheets = glob.glob(os.path.join(target_dir,'pactolus_results_*.csv'))
print my_sheets

df_all_files = pt.merge_sheets(my_sheets)

my_piv=df_all_files.pivot(columns='source_file')

In [ ]:
ref_df = pd.read_pickle('/project/projectdirs/openmsi/projects/ben_run_pactolus/unique_compounds.pkl')
#df = pd.read_csv('/project/projectdirs/openmsi/projects/compound_data/jgi_molecules/new_jgi_compounds.csv')
# df.rename(columns = {'monoisotopoic_mw':'monoisotopic_mw'},inplace=True)

print ref_df.keys()
print df_all_files.keys()
print df_all_files.shape

In [ ]:
temp= df_all_files.reset_index()
# pd.merge(restaurant_ids_dataframe, restaurant_review_frame, on='business_id', how='outer')

big_data = pd.merge(temp,ref_df,on='inchi_key')#.reset_index()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 40)
print big_data.shape
big_data.head(10)

In [ ]:
big_data.to_csv(os.path.join(target_dir,'named_pactolus_all_results_out.csv'))#, index=False)

In [ ]:
my_piv.to_csv(os.path.join(target_dir,'pactolus_pivot_out.csv'))#, index=False)
df_all_files.to_csv(os.path.join(target_dir,'pactolus_all_results_out.csv'))#, index=False)

In [ ]:
my_piv.head()

In [ ]:
import time
import os
def make_tarball(output_dir,do_timestr=True,csv_only=False):
    if do_timestr:
        timestr = time.strftime("%Y%m%d-%H%M%S") + '_'
    else:
        timestr = ''
    tarball_name = timestr + os.path.basename(os.path.normpath(output_dir)) + '.tar.gz'
    if not csv_only:
        %system tar -zcf $tarball_name -C $output_dir .
    else:
        temp = os.path.join(output_dir,'*.csv')
        %system tar -zcf $tarball_name $temp
    print 'done'
    from IPython.core.display import display, HTML
    f1 = os.path.join(os.getcwd(), tarball_name).replace('/global/u2/b/bpb','/user/bpb/notebooks')
    print f1
    f2 = tarball_name
    display(HTML('<a href="%s" download="%s">Start automatic download!</a>'%(f1,f2)))
make_tarball(target_dir,csv_only=True)
# https://jupyter-dev.nersc.gov/user/bpb/files/notebooks/Pactolus%20Tools/20161108-131105_20161011_actinorhodin.tar.gz

In [ ]:
intensity = my_piv['precursor intensity']
files = intensity.keys()
print files
# plt.plot(intensity[files[2]],intensity[files[1]],'.')
# ax = plt.gca()
# ax.set_xscale('log')
# ax.set_yscale('log')
# ax.set_xlabel(files[0])
# ax.set_ylabel(files[1])
# plt.show()

re-import just the pivot if needed


In [ ]:
my_piv = pd.read_csv(os.path.join(target_dir,'pactolus_pivot_out.csv'))

import a network layout


In [ ]:
# pt = reload(pt)
# network = pt.import_network()
def network_cyjs_to_dataframe(network_file='network_v1p0.cyjs'):
    import json
    with open(network_file) as data_file:    
        data = json.load(data_file)
    nodes = pd.DataFrame(data['elements']['nodes'])
    node_data = pd.DataFrame(nodes.data.to_dict()).T
    node_position = pd.DataFrame(nodes.position.to_dict()).T
    node_data = node_data.join(node_position)
    node_data.x = node_data.x.astype(float)
    node_data.y = node_data.y.astype(float)
    
    edges = pd.DataFrame(data['elements']['edges'])
    edge_data = pd.DataFrame(edges.data.to_dict()).T

    edge_data.source = edge_data.source.astype(int)
    edge_data.target = edge_data.target.astype(int)
    node_data.SUID = node_data.SUID.astype(int)

    edge_data = edge_data.merge(node_data[['SUID','x','y']], how='inner', left_on='source', right_on='SUID')
    edge_data.rename(columns={'x':'source_x','y':'source_y'},inplace=True)
    edge_data = edge_data.merge(node_data[['SUID','x','y']], how='inner', left_on='target', right_on='SUID')
    edge_data.rename(columns={'x':'target_x','y':'target_y'},inplace=True)

    return (node_data,edge_data)

(nodes,edges) = network_cyjs_to_dataframe()

In [ ]:
nodes.keys()

In [ ]:
nodes.head(10)

In [ ]:
big_data2 = pd.merge(big_data,nodes,on='inchi_key')#.reset_index()
big_data2.head()

In [ ]:
colors = []
segs = []

norm = mpl_colors.Normalize(vmin=edges.weight.min(), vmax=edges.weight.max())
cmap = plt.cm.plasma
my_color = plt.cm.ScalarMappable(norm=norm, cmap=cmap)

for i,row in edges.iterrows():
    colors.append(my_color.to_rgba(row.weight))
    segs.append(( (row.source_x, row.source_y),
                 (row.target_x,row.target_y) ))

In [ ]:
fig = plt.figure(figsize=(15,15),facecolor='black')
ax = plt.gca()

myMarker = [0.299324789, 0.960615657, 0.2439362] #'aqua'# #'aqua'
myEdges = [0.126700401, 0.00487433, 0.32941519] #

plt.scatter(nodes['x'],nodes['y'], s=4, c='white', alpha=0.015,lw = 0)

vertices = list(zip(zip(edges.source_x,edges.source_y),zip(edges.target_x,edges.target_y)))

ln_coll = mplib_collections.LineCollection(segs,colors = colors,alpha = 1,linewidth=1)

ax.add_collection(ln_coll)
# fyi: if you don't draw the scatter points then you need to set the axes limits manually
plt.axis('off')
plt.show()

In [ ]:
#map my_piv inchi_key to str.contains() for nodes.

In [ ]:
len(myfiles)
my_piv.shape

In [ ]:
(33,
  'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_5_HighExp_Media_Fungus__1001_A__Run24.csv'),
 (58,
  'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_14_HighExp_Media_Control_Conditioned_1001_A__Run18.csv'),
     (64,
  'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_1_LowExp_Media_Fungus__1001_A__Run21.csv'),

In [ ]:
my_piv.keys()

In [ ]:
small_table = my_piv[['Unnamed: 0','precursor intensity']]
small_table = small_table[small_table['precursor intensity'] > 0]
small_table.rename(columns = {'Unnamed: 0':'inchi_key'}, inplace = True)
small_table.drop(0,inplace=True)
small_table.head()

In [ ]:
# my_file = my_piv.columns.get_level_values(1)[0]
# print my_file
# small_table = my_piv['precursor intensity'][my_file].reset_index()
# small_table = small_table[np.isfinite(small_table[my_file])]
small_table = my_piv[['Unnamed: 0','precursor intensity']]
small_table = small_table[small_table['precursor intensity'] > 0]
small_table.rename(columns = {'Unnamed: 0':'inchi_key'}, inplace = True)
small_table.drop(0,inplace=True)

print small_table.shape
# small_table.head()
for i,row in small_table.iterrows():
    idx = nodes.inchi_key.str.contains(row.inchi_key)
    if sum(idx) > 0:
        small_table.loc[i,'x'] = nodes[idx]['x'].tolist()[0]
        small_table.loc[i,'y'] = nodes[idx]['y'].tolist()[0]

# pos = small_table.inchi_key.apply(lambda x: nodes[nodes.inchi_key.str.contains(x)][['x','y']])

In [ ]:
small_table.head()

hits not in network


In [ ]:
not_in_network = small_table[~np.isfinite(small_table['x'])]
not_in_network.to_csv('solar_pactolus_hits_not_in_network.csv')
# nodes[nodes.inchi_key.str.contains(small_table.inchi_key.tolist()[0])][['x','y']]

In [ ]:
small_table = small_table[np.isfinite(small_table['x'])]
small_table.shape

In [ ]:
from matplotlib import collections as mplib_collections
import matplotlib as mpl
import matplotlib.cm as cm

fig = plt.figure(figsize=(50,50),facecolor='cornsilk')
ax = plt.gca()

myMarker = [0.299324789, 0.960615657, 0.2439362] #'aqua'# #'aqua'
myEdges = [0.126700401, 0.00487433, 0.32941519] #

plt.scatter(nodes['x'],nodes['y'], s=0.1, c='white', alpha=0.015,lw = 0)

vertices = list(zip(zip(edges.source_x,edges.source_y),zip(edges.target_x,edges.target_y)))

ln_coll = mplib_collections.LineCollection(segs,colors = colors,alpha = 0.251,linewidth=2)

ax.add_collection(ln_coll)
# fyi: if you don't draw the scatter points then you need to set the axes limits manually
norm = mpl.colors.Normalize(vmin=small_table[my_file].min()**0.5, vmax=small_table[my_file].max()**0.5)
cmap = cm.viridis
my_color = cm.ScalarMappable(norm=norm, cmap=cmap)
# plt.scatter(small_table['x'],small_table['y'], s=12, c='white', alpha=1,lw = 0)
plt.scatter(small_table['x'],small_table['y'], s=20, c=my_color.to_rgba(small_table[my_file]**0.5), alpha=1,lw = 0)

# for i,row in small_table.iterrows():
#     plt.text(row.x,row.y,row['metatlas name'])

# plt.xlim(-150000,-80000)
# plt.ylim(40000,100000)
# plt.axis('off')
temp= my_file.replace('pactolus_results_','').replace('.xls','')
ax.set_title(temp.replace('.csv',''),color='b',fontsize=14)
plt.show()
fig.savefig('HighExp_Media_Control.pdf',facecolor='cornsilk')

In [ ]:
from matplotlib import collections as mplib_collections
import matplotlib as mpl
import matplotlib.cm as cm

import os
my_groups = files
sub_string = os.path.commonprefix(files)
new_groups = [s.replace(sub_string,'').replace('Set','') for s in my_groups]

norm = mpl.colors.Normalize(vmin=0, vmax=1)
cmap = cm.plasma

my_color = cm.ScalarMappable(norm=norm, cmap=cmap)

#w,h
fig = plt.figure(figsize=(15,30),facecolor='black')

num_files = 2#len(list(set(my_piv.columns.get_level_values(1).tolist())))
# print p.index.get_level_values(0)
# p.xs('precursor intensity',axis=1,level=0)[p.columns.get_level_values(1)[0]]
for iii in range(num_files):
    my_file = my_piv.columns.get_level_values(1)[iii]
#     print my_file
    small_table = my_piv['precursor intensity'][my_file]
    # df_all_files.reset_index(inplace=True,drop=True)
    # df_all_files.head()

    import numpy as np

    df = pd.DataFrame(small_table).fillna(0).reset_index()
    names = df[df[my_file]>0]['metatlas name'].tolist()
    print df.keys()
    vals = np.asarray(df[df[my_file]>0][my_file])

    idx = np.argwhere(vals > 0).flatten()
    temp = vals[idx]
    temp = np.log10(temp)
    temp = temp - np.min(temp)
    temp = temp / np.max(temp)
    vals[idx] = temp


    M = np.zeros((len(network['node_name'])))
    idx = []
    not_found = 0
    for i,n in enumerate(network['node_name']):
        try:
            idx = names.index(n)
            M[i] = vals[idx]
        except:
            not_found += 1
#     print not_found, i, len(vals)

    plt.subplot(2,1,iii+1)
    myMarker = [0.299324789, 0.960615657, 0.2439362] #'aqua'# #'aqua'
    myEdges = [0.126700401, 0.00487433, 0.32941519] #
    idx_z = np.argwhere(M[:] == 0).flatten()
    idx_nz = np.argwhere(M[:] > 0.2).flatten()
#     plt.plot(network['x'][idx_z],network['y'][idx_z],'o',markersize=1, markeredgecolor='k', markerfacecolor = 'k',alpha=0.3)
    plt.scatter(network['x'][idx_nz],network['y'][idx_nz], s=10, c=my_color.to_rgba(M[idx_nz]), alpha=1,lw = 0)
    plt.scatter(network['x'][idx_z],network['y'][idx_z], s=1, c='blue', alpha=0.1,lw = 0)
#     plt.plot(network['x'][idx_nz],network['y'][idx_nz],'o',markersize=2, markeredgecolor=my_color.to_rgba(M[idx_nz]), markerfacecolor = my_color.to_rgba(M[idx_nz]))
    colors = []
    segs = []
    for e in network['data']['elements']['edges']:
        idx1 = np.argwhere(network['node_id'] == float(e['data']['source']))[0][0]
        idx2 = np.argwhere(network['node_id'] == float(e['data']['target']))[0][0]
        colors.append('grey')
        segs.append(( (network['x'][idx1], network['y'][idx1]),
                     (network['x'][idx2], network['y'][idx2]) ))

    ln_coll = mplib_collections.LineCollection(segs, colors=colors,alpha=0.3)

    ax = plt.gca()
    ax.add_collection(ln_coll)
    
    temp= my_file.replace('pactolus_results_','').replace('.xls','')
    a = [g for i,g in enumerate(new_groups) if temp in myfiles[i]]
    
    if 'blank' in my_file:
        ax.set_title('blank',color='w')
    elif a:
        ax.set_title(a[0],color='w')#my_file.replace('pactolus_results_','').replace('.xls','').replace('20150322','').replace('20151124','').replace('_MSMS_','').replace('','').replace('pHILIC_',''),color='w')
    else:
        ax.set_title(temp,color='w')
    # fig.set_facecolor('w')
    # ax.set_xlim(0, 600)    
    # ax.set_ylim(0, 400)
    # plt.draw()
    plt.axis('off')
plt.show()
fig.savefig(my_file.replace('pactolus_results_','').replace('.xls','')+'_networks_from_pactolus.pdf',facecolor='black')
#     plt.close(fig)

In [ ]:
sub_string = os.path.commonprefix([my_piv.columns.get_level_values(1)[iii] for iii in range(39)])
for iii in range(num_files):
    my_file = my_piv.columns.get_level_values(1)[iii]
    small_table = my_piv['precursor intensity'][my_file]
    df = pd.DataFrame(small_table).fillna(0).reset_index()
    vals = np.asarray(df[df[my_file]>0][my_file])

    idx = np.argwhere(vals > 0.2).flatten()
    
#     temp= my_file.replace('pactolus_results_','').replace('.xls','')
#     a = [g for i,g in enumerate(new_groups) if temp in myfiles[i]]

    print len(idx),'\t','','\t',my_file.replace(sub_string,'')

In [ ]:
for iii in range(39):
    my_file = my_piv.columns.get_level_values(1)[iii]
#     print my_file

In [ ]:

compound found in all replicates of groups


In [ ]:
num_files = len(list(set(my_piv.columns.get_level_values(1).tolist())))

In [ ]: